# Data processing
import numpy as np
import pandas as pd
# Stats
# Seed the NumPy generator so the simulated data below are reproducible
seed = np.random.SeedSequence(42)
rng = np.random.default_rng(seed=seed)
import pystan
import scipy.stats as st
from statsmodels.distributions.empirical_distribution import ECDF
# Data viz
import bokeh.io
import bokeh.plotting
# Render bokeh plots inline in the notebook
bokeh.io.output_notebook()
# Helper functions
from stan_workup import summarize, predictive_regression
This notebook contains code used to generate estimates for enzymatic parameters for Gordon Rix et al., 2020.
The goal is to estimate $k_\text{cat}$ for each enzyme and $K_\text{M}$ for indole. Reaction mixtures of indole were prepared via serial dilution (where possible) to obtain the following concentration values:
concs = [500, 400, 300, 200, 100, 50, 25, 12.5, 6.25, 3.125, 1.5625, 0] # µM
with a constant concentration of 40 mM serine in 50 mM potassium phosphate buffer, pH 8.0, at 30 °C.
Enzyme was added (50 nM) and the absorbance at 290 nm was immediately recorded to observe the initial velocity of the reaction. For very low indole concentrations the velocities could be seen to level off over time (especially when >50 nM enzyme was used) as all of the indole is consumed, so only the first minute (really 10–70 seconds) was kept to determine the initial velocity. Although the values can be noisy, this is accounted for during the Bayesian inference process by propagating the error in the measurements during the whole process.
The absorbance difference between indole and tryptophan is known at 290 nm, allowing units of AU/time to be converted to mM/time, providing a specific rate. Since the enzyme concentration is also known, we can convert each velocity measurement to an enzyme-normalized rate.
Stan uses a generative modeling strategy to model the system and data and provide inferences. In other words, if one knows a reasonable model of the data-generating process, Stan will use a probabilistic approach to provide estimates of each parameter in the system. In this case, the data-generating process is as follows: at each indole concentration, a rate is sampled from the theoretical Michaelis-Menten curve, with some error (assumed to be Normal). If the Michaelis-Menten equation is of the form:
\begin{align} k = \frac{k_\text{cat}\text{[indole]}}{K_\text{M} + \text{[indole]}}, \end{align}then its generative distribution can be written as:
\begin{align} k \sim \text{Norm}(\frac{k_\text{cat}\text{[indole]}}{K_\text{M} + \text{[indole]}}, σ_k), \end{align}where $σ_k$ is the standard deviation for the Michaelis-Menten generative distribution.
Rates are not measured perfectly, however, and so each rate that is measured has its own data-generating process, depending on hardware limitations (i.e., the lamp and detector on the UV-Vis spectrophotometer) and other factors. The rates are measured as the conversion of indole to tryptophan over time (which is measured via absorbance) and should be linear, and so are simply linear regressions of the absorbance values over time, with deviations from the theoretical slope according to some normal distribution. (An alternative solution would be to model this as a stochastic differential equation.) As an additional factor, the noise of a spectrophotometer is typically heteroscedastic for large changes in absorbance, so each rate measurement is assigned its own standard deviation (as the noise should be greater for higher concentration samples than lower). For measuring absorbance over time (which can be converted to true rates as mentioned above), this data-generating process is as follows:
\begin{align} a_t = a \times t + a_0, \end{align}where $a$ is the rate of absorbance change, $t$ is the time (in seconds), and $a_0$ is absorbance at $t=0$, and each rate is modeled as
\begin{align} a \sim \text{Norm}(m, σ_a), \end{align}where $m$ is the estimated mean slope and $σ_a$ is the sample-specific standard deviation from the mean slope.
Bayesian inference requires "priors" for the estimated parameters ($k_\text{cat}$ and $K_\text{M}$), which should be large and non-specific, but reasonably limit the starting search space for the Markov chain Monte Carlo (MCMC) sampler (more on this below). In other words, it places initial probability density over reasonable values for these parameters, according to your previous knowledge. As long as your posterior distributions for each parameter are much more constrained than your priors, it is unlikely that your prior choices are strongly influencing your results. For $k_\text{cat}$ and $K_\text{M}$ we choose large lognormal priors, which are useful as they constrain the values to be >0 (which they will be) and, in practice, these parameters vary in log-space rather than linear-space. That is, rates and rate constants vary multiplicatively as the energy of the system varies additively, so a lognormal prior better approximates what we know about these parameters. Finally, as Stan's sampler is very efficient, as long as our model is not exceptionally complex or pathological, a reasonable posterior distribution will be found as long as the prior probability is not identically zero. (Note that this is somewhat of an oversimplification.)
The sampling procedure will pick initial parameter guesses from your prior distributions and test how well your data agree with those parameter guesses in the scope of your model. This is stored as a sample, assuming we are out of a warm-up stage. Then new parameter guesses will be made and the sampler will see how well those agree. If they are better, then they will be accepted and stored. If they are worse, they will be accepted and stored with some probability which depends on how reasonable the guess is. This is the basis of Markov chain Monte Carlo (MCMC) sampling, and it can provide an estimation of your posterior distribution, which is a quantification of your uncertainty in your parameter estimations. These samples are ideally drawn independently and so are not correlated (which can be checked in the summary and diagnostics of the samples).
With the samples, you can compute summary statistics about your parameters, such as mean, median, and credible regions, and see how predictive your parameters are for the data. This is shown in the examples below.
To confirm that the model returns values that we expect, we can provide generated data (according to our data-generating process) and send it through Stan. The values we output should be reasonably close to the input values.
# Ground-truth parameters for the simulated enzyme
k_cat = 1  # catalytic rate constant, per sec
K_M = 10  # Michaelis constant, µM
sigma_k = 0.1  # std dev of the noise on sampled rates

# Indole concentrations (µM), each in duplicate.
# Sorted ascending so the zero-concentration (background)
# entries appear first, as the Stan model expects.
base_concs = [500, 400, 300, 200, 100, 50, 25, 12.5, 6.25, 3.125, 1.5625, 0]
concs = np.sort(np.tile(base_concs, 2))  # µM
# Set up function to generate data
def MM(k_cat, K_M, conc, sigma=0):
    """Theoretical Michaelis-Menten equation, using
    absolute parameters with units of µM, seconds.

    With sigma=0 the returned rate equals the theoretical
    value; otherwise Normal noise of scale sigma is added.
    """
    theoretical = (k_cat * conc) / (K_M + conc)
    # Draw the observed rate(s) around the theoretical curve
    return rng.normal(theoretical, sigma)
# Generate data
# One noisy rate per concentration, drawn around the MM curve
ks = MM(k_cat, K_M, concs, sigma_k)
# Check
ks
Looks okay, let's plot it and see.
# Dense grid for the noise-free Michaelis-Menten curve
theor_x = np.linspace(0, 500, 1000)
theor_k = MM(k_cat, K_M, theor_x, sigma=0)

# Overlay the noisy simulated rates on the theoretical curve
p = bokeh.plotting.figure(plot_width=500, plot_height=400)
p.line(theor_x, theor_k, color='black', line_width=2)
p.circle(concs, ks, size=8, fill_alpha=0.5)
p.xaxis.axis_label = '[indole] (µM)'
p.yaxis.axis_label = 'k (per sec)'
bokeh.io.show(p)
Looks good. We'll now take these rates and convert them to the values we actually get: change in absorbance over time.
# Unit conversions
c_Enz = 50 # enzyme concentration, nM
# Absorbance-difference conversion factor between indole and tryptophan
# at 290 nm. Used below as mAU per µM (equivalently AU per mM) — the
# original comment said "mM/sec", which does not match the usage; confirm.
epsilon = 1.89 # mAU per µM (original comment: mM/sec) — TODO confirm units
# Convert rate to specific rate
# ks is a per-second turnover; multiplying by enzyme conc (µM) gives µM/sec
vs = ks*(c_Enz/1000) # µM per sec
# Convert µM per sec to mAU per sec
a_rates = vs*epsilon # mAU per sec
# Check
a_rates
With these we can now generate noisy absorbance values over time. We'll demonstrate for one of them first, and then do it with all of them.
# Take a 500 µM case (concs is sorted ascending, so the last entry is 500 µM)
working_rate = a_rates[-1]
# Generate some times; 0.2 second increments over one minute
times = np.arange(0, 60, 0.2)
# Noise; use as a function of a_rate
# (heteroscedastic: larger signals get proportionally more noise)
sigma_a_factor = 5
def a_t(a_rate, time, a_0=0, sigma_factor=0):
    """Creates a noisy timecourse based on a slope and
    times, and an optional y-intercept and noise factor.
    """
    # Ideal (noise-free) linear absorbance trace
    trace = a_rate * time + a_0
    # Rate-proportional noise plus a small floor so sigma stays > 0
    # even when the rate is zero
    sigma = abs(sigma_factor * a_rate) + 0.05
    return rng.normal(trace, sigma)
# Simulate one noisy timecourse at the working rate
a_ts = a_t(working_rate, times, sigma_factor=sigma_a_factor)

# Quick visual check of the simulated trace
p = bokeh.plotting.figure(plot_width=500, plot_height=400)
p.line(times, a_ts, line_width=1)
p.xaxis.axis_label = 'time (sec)'
p.yaxis.axis_label = 'mAU'
bokeh.io.show(p)
Very reasonable. Now let's plot each.
# Full milliabsorbance matrix: one noisy timecourse per entry in concs
# (a_rates is ordered to match concs, so rows line up with concentrations)
traces = []
for rate in a_rates:
    traces.append(a_t(rate, times, sigma_factor=sigma_a_factor))
mabs = np.array(traces)

p = bokeh.plotting.figure(plot_width=800, plot_height=400)
p.xaxis.axis_label = 'time (s)'
p.yaxis.axis_label = 'mAU'

# One color per unique concentration; duplicate traces share a color
colors = bokeh.palettes.magma(len(concs))[::2]
color_dict = dict(zip(np.unique(concs), colors))

for mab, conc in zip(mabs, concs):
    p.line(times, mab, color=color_dict[conc], legend_label=str(conc))
bokeh.io.show(p)
Now we can send these values through the Stan sampler and see if we get reasonable estimates of our original parameters ($k_\text{cat}$ = 1, $K_\text{M}$ = 10).
Let's first compile and display the model, which is written according to what was specified in 'Generative Distributions in Stan'.
# Create the model and display it
# Compiles the Stan program from file (compilation can take a while)
model = pystan.StanModel('MM_model.stan')
print(model)
Now we can prepare the data and do some sampling.
# Multiply absorbances by 100 to give rates of ~1
# This speeds up modeling, and is later factored in
scaling_factor = 100
scaled_absorbances = mabs*scaling_factor
# Store as dictionary
# Keys must match the data block of MM_model.stan
data = dict(
    N=len(times),               # timepoints per trace
    M=len(concs),               # traces (concentrations, incl. duplicates)
    M0=sum(concs == 0),         # zero-indole (background) traces; concs is sorted so these come first
    t=times,
    conc=concs,
    a=scaled_absorbances,
    scaling_factor=scaling_factor,
    epsilon=epsilon,
    c_Enz=c_Enz,
    max_conc=int(max(concs)),
    conc_ppc=range(0, int(max(concs)+1)),  # grid for posterior predictive checks
)
# Sample
# pystan defaults: 4 chains x 2000 iterations (half warm-up) -> 4000 draws
samples = model.sampling(data)
# Check out a summary
samples
# Convert to a dataframe
df_stan = samples.to_dataframe()
# Check the results
summarize(df_stan['k_cat'], units='per sec')
summarize(df_stan['K_M'], units='µM')
The analysis looks good. We notice a few things:
The priors were lognormal (and absolutely enormous) to start. The posterior for $k_\text{cat}$ now looks normal, which means we have enough data to overwhelm the prior and have our samples be distributed more akin to the model (which assumes a normal distribution for $k_\text{cat}$). On the other hand, $K_\text{M}$ is still a bit lognormal, as evidenced by the tail toward higher values. This results in a larger upper bound for $K_\text{M}$ (50% to 97.5%), but the true value still lies within the 95% credible region. The posterior for $K_\text{M}$ has still been greatly informed by the data, however, and hardly resembles the original prior at all. (See below.)
We can see how our samples for the rates at each indole concentration compare to the original data we created.
# Collect the per-trace rate parameters from the posterior samples
rate_samples = [col for col in df_stan.columns if 'rate' in col]

# Tidy: one row per (rate column, posterior draw)
df_rates = df_stan[rate_samples].melt(
    var_name='sample', value_name='estimated scaled rate'
)

# Attach the indole concentration for each rate column
# (rate columns are ordered to match concs)
sample_to_conc = dict(zip(rate_samples, concs))
df_rates['[indole] (µM)'] = df_rates['sample'].map(sample_to_conc)

# Undo the scaling and unit conversions to recover k (per sec)
df_rates['estimated rate (per sec)'] = (
    df_rates['estimated scaled rate'] / scaling_factor / epsilon / (c_Enz / 1000)
)

# Index by sample column name for easy per-trace slicing
df_rates = df_rates.set_index('sample')
# Check
df_rates
# Rebuild the theoretical curve for comparison against the posterior
theor_x = np.linspace(0, 500, 1000)
theor_k = MM(k_cat, K_M, theor_x, sigma=0)
p = bokeh.plotting.figure(plot_width=500, plot_height=400)
p.xaxis.axis_label = '[indole] (µM)'
p.yaxis.axis_label = 'k (per sec)'
p.line(theor_x, theor_k, color='black', line_width=2, legend_label='theoretical MM curve')
p.circle(concs, ks, size=8, fill_alpha=0.5, legend_label='actual sampled rates')
# Plot the 4000 actual sample values (effectively error bars)
p.square(
    df_rates['[indole] (µM)'],
    df_rates['estimated rate (per sec)'],
    size=2,
    fill_color='black',
    line_color=None,
    fill_alpha=1,
)
# Plot the median values
# One marker per rate column; the median of the (constant) concentration
# column just recovers that trace's concentration
for sample in df_rates.index.unique():
    _df = df_rates.loc[sample]
    p.circle(
        _df['[indole] (µM)'].median(),
        _df['estimated rate (per sec)'].median(),
        size=6,
        fill_color='orange',
        line_color='black',
        fill_alpha=0.8,
        legend_label='median rate estimates',
    )
p.legend.location = 'bottom_right'
bokeh.io.show(p)
Here we see that the points are often a little bit off. This comes from a part of the data-generating process in the Stan model that was not used to generate the data, which is background correction. In the full model, it is assumed that the background rate of change for the samples without any indole is the same in all cases (e.g., some sort of background drift). So this value is subtracted from all rate estimates. We see that this isn't the case for the generated data (as we know how we generated it and that the estimates don't align), but that's only because we have the real data. This is impossible to know in a real experiment; thus, we model what we expect to happen in our real experiment as best we can.
Finally, we can use our median estimates for $k_\text{cat}$ and $K_\text{M}$ to plot our median Michaelis-Menten curve estimation (what our theoretical Michaelis-Menten behavior would look like assuming the median estimates were exactly correct).
# Add median estimation
# Overlay the MM curve implied by the posterior median parameter estimates
# (adds to and re-shows the figure `p` from the previous cell)
estimated_ks = MM(df_stan['k_cat'].median(), df_stan['K_M'].median(), theor_x, sigma=0)
p.line(theor_x, estimated_ks, color='#004D00', line_width=3, legend_label='median estimated MM curve')
bokeh.io.show(p)
Part of the model was to use the samples to generate posterior predictive checks, which are shown to give an idea of how predictive the estimated parameters for the given data. This is best shown for the theoretical Michaelis-Menten curve in the form of shaded credible regions, which we'll have as 95% (widest region), 75%, 50%, 25% (narrowest region), and the median line (which should align well with the median estimated MM curve above). We'll overlay all the previous data as well, except the median estimate curve.
# Posterior predictive check: shaded credible regions for the MM curve
p = predictive_regression(df_stan, 'k_ppc')
p.line(theor_x, theor_k, color='black', line_width=2, legend_label='theoretical MM curve')
p.circle(concs, ks, size=8, fill_alpha=0.5, legend_label='actual sampled rates')
# Plot the 4000 actual sample values (effectively error bars)
p.square(
    df_rates['[indole] (µM)'],
    df_rates['estimated rate (per sec)'],
    size=2,
    fill_color='black',
    line_color=None,
    fill_alpha=1,
)
# Plot the median values
for sample in df_rates.index.unique():
    _df = df_rates.loc[sample]
    p.circle(
        _df['[indole] (µM)'].median(),
        _df['estimated rate (per sec)'].median(),
        size=6,
        fill_color='orange',
        line_color='black',
        fill_alpha=0.8,
        legend_label='median rate estimates',
    )
p.legend.location = 'bottom_right'
bokeh.io.show(p)
(As a note, the jaggedness of the credible regions can be smoothed out by taking more than 4000 samples, but this will take longer and take up much more space in memory (or on disk, if the samples are saved).)
We'll quickly look at how much our data informed the priors.
# Compare the k_cat prior (CDF) to the posterior (ECDF)
xs = np.linspace(0, 2, 1000)
# NOTE(review): in scipy, lognorm's `scale` should be exp(mu); passing
# scale=np.log(150) gives a prior median of log(150) ≈ 5 per sec —
# confirm this matches the prior declared in MM_model.stan
test = st.lognorm(s=2.5, loc=0, scale=np.log(150)).cdf(xs)
p = bokeh.plotting.figure(plot_width=500, plot_height=400)
p.xaxis.axis_label = 'k_cat'
p.yaxis.axis_label = '(E)CDF'
ecdf = ECDF(df_stan['k_cat'].values)
p.circle(ecdf.x, ecdf.y, legend_label='posterior ECDF')
p.line(xs, test, color='black', line_width=3, legend_label='prior CDF')
bokeh.io.show(p)
# Compare the K_M prior (CDF) to the posterior (ECDF)
xs = np.linspace(0, 50, 1000)
# NOTE(review): as above, scale=np.log(500) means a prior median of
# log(500) ≈ 6.2 µM — confirm against the prior in MM_model.stan
test = st.lognorm(s=1.5, loc=0, scale=np.log(500)).cdf(xs)
p = bokeh.plotting.figure(plot_width=500, plot_height=400)
p.xaxis.axis_label = 'K_M'
p.yaxis.axis_label = '(E)CDF'
ecdf = ECDF(df_stan['K_M'].values)
p.circle(ecdf.x, ecdf.y, legend_label='posterior ECDF')
p.line(xs, test, color='black', line_width=3, legend_label='prior CDF')
bokeh.io.show(p)
Now that we're convinced that we can recover parameters with this model (and assuming we trust the model as a reasonable and useful description of our system), we'll use some actual enzyme kinetics data, as was done to estimate parameters for the paper.
First we'll deal with cleaning up the raw absorbance data.
# Load the raw UV-Vis kinetics data
# (wide format: one time column plus one absorbance column per run)
df = pd.read_excel('example_kinetics_data.xlsx')
df.head()
Clean up the column names.
(Note: multiple columns with the same name in a pandas DataFrame is not recommended; only momentarily using it here to prepare numpy arrays.)
# Build the cleaned column names with a small local helper
def _clean_column(column):
    """Standardize the time label; strip run metadata from sample names."""
    if column == 'Time ( Second )':
        return 'Time (s)'
    stripped = column.replace(' - RawData', '')
    parts = stripped.split('_')
    if len(parts) == 1:
        return parts[0]
    # Names like <enzyme>_<rep>_<conc>uM: keep just the concentration
    return parts[2].replace('uM', '')

# Rename the columns
df.columns = [_clean_column(column) for column in df.columns]
# Check
df.head()
# t0 subtract and transpose to simplify operations
# Subtracting row 0 zeroes every trace (and the time column) at the first
# timepoint; transposing gives one row per sample column
t0_sub_values = (df.values - df.values[0]).T
# Ignore Time (column 0) and blank (column -1) columns
full_absorbances = t0_sub_values[1:-1]
# Convert AU to mAU
absorbances = full_absorbances*1000
# Check
absorbances
# Plot to check
concs = df.columns[1:-1].astype(float)
times = df['Time (s)'].values
p = bokeh.plotting.figure(plot_width=800, plot_height=400)
p.xaxis.axis_label = 'Time (s)'
p.yaxis.axis_label = 'mAU'
# Build the palette once — it is loop-invariant, so recomputing it every
# iteration was wasted work. 27 entries keeps the original colors;
# consider len(concs)+2 for consistency with the trimmed-data plot below.
color = bokeh.palettes.magma(27)
for i, absorbance in enumerate(absorbances):
    p.line(times, absorbance, color=color[i], legend_label=str(concs[i]))
bokeh.io.show(p)
Here we see that the real data looks remarkably like generated data (with some more real-world inconsistencies), suggesting that our data-generating process (and our overall model) is reasonable. We can proceed.
# Keep only 10-70 s, where the traces are still linear (initial velocity)
df_min = df[(df['Time (s)'] >= 10) & (df['Time (s)'] <= 70)].copy()
df_min.head()
# Prepare values
# Re-zero at the new first timepoint, drop time/blank columns, convert to mAU
absorbances = (df_min.values - df_min.values[0]).T[1:-1]*1000
concs = df_min.columns[1:-1].astype(float)
times = df_min['Time (s)'].values
# Plot
p = bokeh.plotting.figure(plot_width=800, plot_height=400)
p.xaxis.axis_label = 'Time (s)'
p.yaxis.axis_label = 'mAU'
# Build the palette once — it is loop-invariant, so recomputing it every
# iteration was wasted work
color = bokeh.palettes.magma(len(concs)+2)
for i, absorbance in enumerate(absorbances):
    p.line(times, absorbance, color=color[i], legend_label=str(concs[i]))
bokeh.io.show(p)
# Multiply absorbances by 100 to give rates of ~1
# This speeds up modeling, and is later factored in
scaling_factor = 100
scaled_absorbances = absorbances*scaling_factor
# Enzyme concentration, in nM
c_Enz = 50
# Store as dictionary
# Same structure as the simulated-data run; keys must match MM_model.stan
data = dict(
    N=len(times),
    M=len(concs),
    M0=sum(concs == 0),
    t=times,
    conc=concs,
    a=scaled_absorbances,
    scaling_factor=scaling_factor,
    epsilon=1.89,  # absorbance conversion factor (same value as `epsilon` above)
    c_Enz=c_Enz,
    max_conc=int(max(concs)),
    conc_ppc=range(0, int(max(concs)+1)),  # grid for posterior predictive checks
)
# Sample; use more iterations and warm-up for better results
# This should take ~1 minute on a CPU
samples = model.sampling(data)
# Check out a summary
samples
# Convert to a dataframe
df_stan = samples.to_dataframe()
# Check
df_stan
Efficient storage, if needed.
# Optional: persist the samples to disk (commented out by default)
# # Write out a ~20 MB file
# df_stan.to_parquet('df_stan.parquet', compression='gzip')
# # Read in the file
# df_stan = pd.read_parquet('df_stan.parquet')
# # Check again
# df_stan
# Get summary stats
# Posterior medians and credible intervals for the fitted parameters
summarize(df_stan['k_cat'], units='per sec')
summarize(df_stan['K_M'], units='µM')
# Get each rate sample
rate_samples = [col for col in df_stan.columns if 'rate' in col]
df_rates = df_stan[rate_samples]
# Tidy the data
df_rates = df_rates.melt(var_name='sample', value_name='estimated scaled rate')
# Add in concentration info for each sample
# (assumes the rate columns are ordered to match concs — TODO confirm
# against the Stan model's parameter ordering)
df_rates['[indole] (µM)'] = df_rates['sample'].map({sample: conc for sample, conc in zip(rate_samples, concs)})
# Convert rate to k
# Undo scaling, then mAU -> µM (epsilon), then normalize by enzyme conc (µM)
df_rates['estimated rate (per sec)'] = (((df_rates['estimated scaled rate']/scaling_factor)
                                         /epsilon)
                                        /(c_Enz/1000))
# Set easier indexing
df_rates.set_index('sample', inplace=True)
# Generate the predictive regression plot
# (shaded posterior predictive credible regions for the MM curve)
p = predictive_regression(df_stan, 'k_ppc')
# Plot the 4000 actual sample values (effectively error bars)
p.square(
    df_rates['[indole] (µM)'],
    df_rates['estimated rate (per sec)'],
    size=2,
    fill_color='black',
    line_color=None,
    fill_alpha=1,
)
# Plot the median values
# One marker per rate column; median of the (constant) concentration
# column recovers that trace's concentration
for sample in df_rates.index.unique():
    _df = df_rates.loc[sample]
    p.circle(
        _df['[indole] (µM)'].median(),
        _df['estimated rate (per sec)'].median(),
        size=6,
        fill_color='orange',
        line_color='black',
        fill_alpha=0.8,
        legend_label='median rate estimates',
    )
p.legend.location = 'bottom_right'
bokeh.io.show(p)